In [1]:
# IPython %cd magic: move from the notebook's directory up to the repository
# root so relative paths like "source/..." resolve (output below confirms).
cd ../..
/Users/shanekercheval/repos/data-science-template
In [2]:
%run "source/config/notebook_settings.py"
import os
import mlflow
from mlflow.tracking import MlflowClient
from helpsk.utility import read_pickle
import helpsk as hlp

from source.library.utilities import Timer, log_info, get_config

config = get_config()
mlflow_uri = config['MLFLOW']['URI']
log_info(f"MLFlow URI: {mlflow_uri}")

client = MlflowClient(tracking_uri='http://127.0.0.1:1234')
2022-06-10 14:49:52 - INFO     | MLFlow URI: http://127.0.0.1:1234

Get Latest Experiment Run from MLflow¶

In [3]:
# Look up the configured experiment, list its runs, and keep the one with the
# most recent start time (first occurrence wins on ties, matching argmax).
credit_experiment = client.get_experiment_by_name(name=config['MLFLOW']['EXPERIMENT_NAME'])
runs = client.list_run_infos(experiment_id=credit_experiment.experiment_id)
latest_run = max(runs, key=lambda run: run.start_time)
In [4]:
# Download the experiment-summary YAML logged with the latest run and parse it
# into an MLExperimentResults object for the analysis below.
yaml_path = client.download_artifacts(run_id=latest_run.run_id, path='experiment.yaml')
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name=yaml_path)
In [5]:
# Download and unpickle the best fitted estimator (an sklearn Pipeline) that
# was logged as an artifact of the latest run.
estimator_path = client.download_artifacts(
    run_id=latest_run.run_id,
    path='experiment_best_estimator.pkl',
)
best_estimator = read_pickle(estimator_path)
In [6]:
# Display the fitted pipeline (rendered as rich HTML in Jupyter).
best_estimator
Out[6]:
Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   TransformerChooser(transformer=SimpleImputer())),
                                                                  ('scaler',
                                                                   TransformerChooser()),
                                                                  ('pca',
                                                                   TransformerChooser())]),
                                                  ['duration', 'credit_amount',
                                                   'installment_commitment',
                                                   'residence_since', 'age',
                                                   'existing_credits',
                                                   'num_dependents']),
                                                 ('non_numeric',
                                                  Pipeline(steps...,
                                                                   TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore')))]),
                                                  ['checking_status',
                                                   'credit_history', 'purpose',
                                                   'savings_status',
                                                   'employment',
                                                   'personal_status',
                                                   'other_parties',
                                                   'property_magnitude',
                                                   'other_payment_plans',
                                                   'housing', 'job',
                                                   'own_telephone',
                                                   'foreign_worker'])])),
                ('model',
                 RandomForestClassifier(n_estimators=500, random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   TransformerChooser(transformer=SimpleImputer())),
                                                                  ('scaler',
                                                                   TransformerChooser()),
                                                                  ('pca',
                                                                   TransformerChooser())]),
                                                  ['duration', 'credit_amount',
                                                   'installment_commitment',
                                                   'residence_since', 'age',
                                                   'existing_credits',
                                                   'num_dependents']),
                                                 ('non_numeric',
                                                  Pipeline(steps...,
                                                                   TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore')))]),
                                                  ['checking_status',
                                                   'credit_history', 'purpose',
                                                   'savings_status',
                                                   'employment',
                                                   'personal_status',
                                                   'other_parties',
                                                   'property_magnitude',
                                                   'other_payment_plans',
                                                   'housing', 'job',
                                                   'own_telephone',
                                                   'foreign_worker'])])),
                ('model',
                 RandomForestClassifier(n_estimators=500, random_state=42))])
ColumnTransformer(transformers=[('numeric',
                                 Pipeline(steps=[('imputer',
                                                  TransformerChooser(transformer=SimpleImputer())),
                                                 ('scaler',
                                                  TransformerChooser()),
                                                 ('pca',
                                                  TransformerChooser())]),
                                 ['duration', 'credit_amount',
                                  'installment_commitment', 'residence_since',
                                  'age', 'existing_credits',
                                  'num_dependents']),
                                ('non_numeric',
                                 Pipeline(steps=[('encoder',
                                                  TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore')))]),
                                 ['checking_status', 'credit_history',
                                  'purpose', 'savings_status', 'employment',
                                  'personal_status', 'other_parties',
                                  'property_magnitude', 'other_payment_plans',
                                  'housing', 'job', 'own_telephone',
                                  'foreign_worker'])])
['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
TransformerChooser(transformer=SimpleImputer())
SimpleImputer()
SimpleImputer()
TransformerChooser()
TransformerChooser()
['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
TransformerChooser(transformer=OneHotEncoder(handle_unknown='ignore'))
OneHotEncoder(handle_unknown='ignore')
OneHotEncoder(handle_unknown='ignore')
RandomForestClassifier(n_estimators=500, random_state=42)

Training & Test Data Info¶

In [7]:
# download_artifacts returns the local filesystem path of the cached artifact;
# shown here to document where MLflow stores the training data on disk.
client.download_artifacts(run_id=latest_run.run_id, path='x_train.pkl')
Out[7]:
'/Users/shanekercheval/repos/data-science-template/mlflow-artifact-root/1/fc34554fdaeb406f9123563a6f2a22d7/artifacts/x_train.pkl'
In [8]:
def _load_artifact_pickle(file_name: str):
    """Download a pickled dataset artifact from the latest run and load it."""
    # `pd.read_pickle` (the original used `pd.pandas.read_pickle`, an
    # accidental self-referencing attribute access that happens to work).
    return pd.read_pickle(client.download_artifacts(run_id=latest_run.run_id, path=file_name))

with Timer("Loading training/test datasets"):
    X_train = _load_artifact_pickle('x_train.pkl')
    X_test = _load_artifact_pickle('x_test.pkl')
    y_train = _load_artifact_pickle('y_train.pkl')
    y_test = _load_artifact_pickle('y_test.pkl')
2022-06-10 14:49:52 - INFO     | *****Timer Started: Loading training/test datasets
2022-06-10 14:49:52 - INFO     | *****Timer Finished (0.00 seconds)
In [9]:
# Sanity-check the dataset sizes: feature and label counts should line up.
for dataset_info in (X_train.shape, len(y_train), X_test.shape, len(y_test)):
    log_info(dataset_info)
2022-06-10 14:49:52 - INFO     | (800, 20)
2022-06-10 14:49:52 - INFO     | 800
2022-06-10 14:49:52 - INFO     | (200, 20)
2022-06-10 14:49:52 - INFO     | 200
In [10]:
# Class labels and their frequencies in the training target.
train_classes, train_counts = np.unique(y_train, return_counts=True)
(train_classes, train_counts)
Out[10]:
(array([0, 1]), array([546, 254]))
In [11]:
# Training-set class balance. Compute np.unique once instead of twice
# (the original re-ran the full unique/count pass for the denominator).
train_label_counts = np.unique(y_train, return_counts=True)[1]
train_label_counts / np.sum(train_label_counts)
Out[11]:
array([0.6825, 0.3175])
In [12]:
# Test-set class balance. Compute np.unique once instead of twice
# (the original re-ran the full unique/count pass for the denominator).
test_label_counts = np.unique(y_test, return_counts=True)[1]
test_label_counts / np.sum(test_label_counts)
Out[12]:
array([0.77, 0.23])

Cross Validation Results¶

Best Scores/Params¶

In [13]:
# Best mean cross-validation score across all trials.
best_score = results.best_score
log_info(f"Best Score: {best_score}")
2022-06-10 14:49:52 - INFO     | Best Score: 0.7759520520207864
In [14]:
# Hyperparameter combination that produced the best score.
best_params = results.best_params
log_info(f"Best Params: {best_params}")
2022-06-10 14:49:52 - INFO     | Best Params: {'model': 'RandomForestClassifier()', 'imputer': 'SimpleImputer()', 'scaler': 'None', 'pca': 'None', 'encoder': 'OneHotEncoder()'}
In [15]:
# Show the single best configuration for each model type by ranking rows
# within each model group on mean ROC-AUC and keeping rank 1.
cv_df = results.to_formatted_dataframe(return_style=False, include_rank=True)
per_model_rank = cv_df.groupby("model")["roc_auc Mean"].rank(method="first", ascending=False)
cv_df = cv_df.assign(model_rank=per_model_rank)
cv_df.query('model_rank == 1')
Out[15]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI model C max_features max_depth n_estimators min_samples_split min_samples_leaf max_samples criterion learning_rate min_child_weight subsample colsample_bytree colsample_bylevel reg_alpha reg_lambda imputer scaler pca encoder model_rank
10 1 0.78 0.75 0.80 RandomForestClassifier() NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN SimpleImputer() None None OneHotEncoder() 1.00
5 2 0.77 0.77 0.78 ExtraTreesClassifier() NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN SimpleImputer() None None OneHotEncoder() 1.00
0 3 0.77 0.75 0.80 LogisticRegression() NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN SimpleImputer() StandardScaler() None OneHotEncoder() 1.00
19 7 0.76 0.72 0.80 XGBClassifier() NaN NaN 2.00 1095.00 NaN NaN NaN NaN 0.04 6.00 0.88 0.75 0.56 0.04 1.09 SimpleImputer(strategy='median') None None OneHotEncoder() 1.00
In [16]:
# Full cross-validation leaderboard (styled), including the overall rank.
results.to_formatted_dataframe(
    return_style=True,
    include_rank=True,
    num_rows=500,
)
Out[16]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI model C max_features max_depth n_estimators min_samples_split min_samples_leaf max_samples criterion learning_rate min_child_weight subsample colsample_bytree colsample_bylevel reg_alpha reg_lambda imputer scaler pca encoder
1 0.776 0.753 0.799 RandomForestClassifier() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None None OneHotEncoder()
2 0.773 0.769 0.776 ExtraTreesClassifier() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None None OneHotEncoder()
3 0.772 0.747 0.797 LogisticRegression() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() StandardScaler() None OneHotEncoder()
4 0.772 0.751 0.794 ExtraTreesClassifier() <NA> 0.776 55.000 1,390.000 17.000 5.000 0.556 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None PCA('mle') CustomOrdinalEncoder()
5 0.764 0.716 0.812 ExtraTreesClassifier() <NA> 0.077 49.000 1,699.000 38.000 11.000 0.605 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') None None CustomOrdinalEncoder()
6 0.759 0.706 0.811 RandomForestClassifier() <NA> 0.220 4.000 1,656.000 27.000 6.000 0.916 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None None OneHotEncoder()
7 0.758 0.719 0.796 XGBClassifier() <NA> <NA> 2.000 1,095.000 <NA> <NA> <NA> <NA> 0.043 6.000 0.875 0.748 0.561 0.044 1.086 SimpleImputer(strategy='median') None None OneHotEncoder()
8 0.752 0.707 0.797 RandomForestClassifier() <NA> 0.424 17.000 538.000 19.000 19.000 0.725 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None PCA('mle') CustomOrdinalEncoder()
9 0.750 0.688 0.812 RandomForestClassifier() <NA> 0.637 99.000 1,218.000 43.000 15.000 0.662 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None PCA('mle') CustomOrdinalEncoder()
10 0.749 0.734 0.764 XGBClassifier() <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None None OneHotEncoder()
11 0.740 0.702 0.779 ExtraTreesClassifier() <NA> 0.770 13.000 739.000 29.000 27.000 0.725 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None PCA('mle') CustomOrdinalEncoder()
12 0.739 0.666 0.812 RandomForestClassifier() <NA> 0.743 82.000 1,323.000 19.000 31.000 0.588 entropy <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None None CustomOrdinalEncoder()
13 0.737 0.697 0.777 ExtraTreesClassifier() <NA> 0.778 24.000 1,816.000 7.000 45.000 0.768 gini <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='most_frequent') None None CustomOrdinalEncoder()
14 0.734 0.699 0.769 LogisticRegression() 0.019 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') StandardScaler() None CustomOrdinalEncoder()
15 0.730 0.712 0.749 LogisticRegression() 0.339 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') MinMaxScaler() PCA('mle') CustomOrdinalEncoder()
16 0.730 0.599 0.860 LogisticRegression() 0.000 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') StandardScaler() None OneHotEncoder()
17 0.725 0.665 0.785 XGBClassifier() <NA> <NA> 3.000 1,237.000 <NA> <NA> <NA> <NA> 0.224 8.000 0.737 0.682 0.917 0.157 1.278 SimpleImputer(strategy='most_frequent') None None OneHotEncoder()
18 0.724 0.691 0.756 XGBClassifier() <NA> <NA> 2.000 1,891.000 <NA> <NA> <NA> <NA> 0.073 5.000 0.677 0.913 0.792 0.001 1.994 SimpleImputer(strategy='median') None PCA('mle') OneHotEncoder()
19 0.711 0.667 0.755 XGBClassifier() <NA> <NA> 2.000 1,618.000 <NA> <NA> <NA> <NA> 0.070 19.000 0.714 0.880 0.665 0.127 1.638 SimpleImputer(strategy='median') None PCA('mle') OneHotEncoder()
20 0.710 0.684 0.735 LogisticRegression() 0.002 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer(strategy='median') MinMaxScaler() None CustomOrdinalEncoder()
In [17]:
# Cross-validation results restricted to the random-forest trials.
rf_query = 'model == "RandomForestClassifier()"'
results.to_formatted_dataframe(query=rf_query, include_rank=True)
Out[17]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI max_features max_depth n_estimators min_samples_split min_samples_leaf max_samples criterion imputer pca encoder
1 0.776 0.753 0.799 <NA> <NA> <NA> <NA> <NA> <NA> <NA> SimpleImputer() None OneHotEncoder()
2 0.759 0.706 0.811 0.220 4.000 1,656.000 27.000 6.000 0.916 gini SimpleImputer(strategy='most_frequent') None OneHotEncoder()
3 0.752 0.707 0.797 0.424 17.000 538.000 19.000 19.000 0.725 entropy SimpleImputer() PCA('mle') CustomOrdinalEncoder()
4 0.750 0.688 0.812 0.637 99.000 1,218.000 43.000 15.000 0.662 gini SimpleImputer(strategy='most_frequent') PCA('mle') CustomOrdinalEncoder()
5 0.739 0.666 0.812 0.743 82.000 1,323.000 19.000 31.000 0.588 entropy SimpleImputer() None CustomOrdinalEncoder()
In [18]:
# Cross-validation results restricted to the logistic-regression trials.
lr_query = 'model == "LogisticRegression()"'
results.to_formatted_dataframe(query=lr_query, include_rank=True)
Out[18]:
rank roc_auc Mean roc_auc 95CI.LO roc_auc 95CI.HI C imputer scaler pca encoder
1 0.772 0.747 0.797 <NA> SimpleImputer() StandardScaler() None OneHotEncoder()
2 0.734 0.699 0.769 0.019 SimpleImputer(strategy='median') StandardScaler() None CustomOrdinalEncoder()
3 0.730 0.712 0.749 0.339 SimpleImputer(strategy='median') MinMaxScaler() PCA('mle') CustomOrdinalEncoder()
4 0.730 0.599 0.860 0.000 SimpleImputer(strategy='median') StandardScaler() None OneHotEncoder()
5 0.710 0.684 0.735 0.002 SimpleImputer(strategy='median') MinMaxScaler() None CustomOrdinalEncoder()

BayesSearchCV Performance Over Time¶

In [19]:
# Score progression across BayesSearchCV trials, one facet per model type.
trials_fig = results.plot_performance_across_trials(facet_by='model')
trials_fig.show()
In [20]:
# Trial-over-trial performance restricted to the random-forest trials.
rf_trials_fig = results.plot_performance_across_trials(query='model == "RandomForestClassifier()"')
rf_trials_fig.show()

Variable Performance Over Time¶

In [21]:
# How each searched hyperparameter value evolved across the RF trials.
param_values_fig = results.plot_parameter_values_across_trials(query='model == "RandomForestClassifier()"')
param_values_fig.show()

Scatter Matrix¶

In [22]:
# NOTE(review): cell contains only commented-out code — presumably disabled
# because the scatter matrix is slow/heavy to render (TODO confirm); either
# delete the cell or re-enable it rather than keeping dead code.
# results.plot_scatter_matrix(query='model == "RandomForestClassifier()"',
#                             height=1000, width=1000).show()

Variable Performance - Numeric¶

In [23]:
# Score vs each numeric hyperparameter for the random forest.
# NOTE(review): this call emitted a statsmodels lowess RuntimeWarning
# ("invalid value encountered in true_divide") — see the output below;
# likely harmless (a parameter with few distinct values) but worth confirming.
results.plot_performance_numeric_params(query='model == "RandomForestClassifier()"',
                                        height=800)
/Users/shanekercheval/repos/data-science-template/.venv/lib/python3.9/site-packages/statsmodels/nonparametric/smoothers_lowess.py:227: RuntimeWarning:

invalid value encountered in true_divide

In [24]:
# Parallel-coordinates view of the RF hyperparameters vs score.
parallel_fig = results.plot_parallel_coordinates(query='model == "RandomForestClassifier()"')
parallel_fig.show()

Variable Performance - Non-Numeric¶

In [25]:
# Score distributions for the categorical (non-numeric) RF parameters.
non_numeric_fig = results.plot_performance_non_numeric_params(query='model == "RandomForestClassifier()"')
non_numeric_fig.show()

In [26]:
# Score vs max_features for the random forest; point size encodes max_depth
# and color encodes the encoder used.
rf_only = 'model == "RandomForestClassifier()"'
results.plot_score_vs_parameter(
    query=rf_only,
    parameter='max_features',
    size='max_depth',
    color='encoder',
)

In [27]:
# NOTE(review): commented-out exploration cell — delete or re-enable rather
# than keeping dead code in the final notebook.
# results.plot_parameter_vs_parameter(
#     query='model == "XGBClassifier()"',
#     parameter_x='colsample_bytree',
#     parameter_y='learning_rate',
#     size='max_depth'
# )
In [28]:
# NOTE(review): commented-out exploration cell — delete or re-enable rather
# than keeping dead code in the final notebook.
# results.plot_parameter_vs_parameter(
#     query='model == "XGBClassifier()"',
#     parameter_x='colsample_bytree',
#     parameter_y='learning_rate',
#     size='imputer'
# )

Best Model - Test Set Performance¶

In [29]:
# Predicted probability of the positive class for each test observation;
# peek at the first ten values.
test_predictions = best_estimator.predict_proba(X_test)[:, 1]
test_predictions[:10]
Out[29]:
array([0.13 , 0.522, 0.456, 0.554, 0.066, 0.294, 0.054, 0.302, 0.056,
       0.306])
In [30]:
# Evaluate the test-set predictions against the actual labels.
# NOTE(review): 0.37 looks like a hand-tuned classification threshold (vs the
# conventional 0.5) — presumably chosen from the threshold curves below;
# document where this value came from.
evaluator = hlp.sklearn_eval.TwoClassEvaluator(
    actual_values=y_test,
    predicted_scores=test_predictions,
    score_threshold=0.37
)
In [31]:
# Histogram of predicted scores split by actual class.
evaluator.plot_actual_vs_predict_histogram()
In [32]:
# Confusion matrix at the chosen score threshold.
evaluator.plot_confusion_matrix()
In [33]:
# Full metric table, compared against two dummy-classifier baselines.
baseline_strategies = ['prior', 'constant']
evaluator.all_metrics_df(
    return_style=True,
    dummy_classifier_strategy=baseline_strategies,
    round_by=3,
)
Out[33]:
  Score Dummy (prior) Dummy (constant) Explanation
AUC 0.785 0.500 0.500 Area under the ROC curve (true pos. rate vs false pos. rate); ranges from 0.5 (purely random classifier) to 1.0 (perfect classifier)
True Positive Rate 0.804 0.000 1.000 80.4% of positive instances were correctly identified.; i.e. 37 "Positive Class" labels were correctly identified out of 46 instances; a.k.a Sensitivity/Recall
True Negative Rate 0.682 1.000 0.000 68.2% of negative instances were correctly identified.; i.e. 105 "Negative Class" labels were correctly identified out of 154 instances
False Positive Rate 0.318 0.000 1.000 31.8% of negative instances were incorrectly identified as positive; i.e. 49 "Negative Class" labels were incorrectly identified as "Positive Class", out of 154 instances
False Negative Rate 0.196 1.000 0.000 19.6% of positive instances were incorrectly identified as negative; i.e. 9 "Positive Class" labels were incorrectly identified as "Negative Class", out of 46 instances
Positive Predictive Value 0.430 0.000 0.230 When the model claims an instance is positive, it is correct 43.0% of the time; i.e. out of the 86 times the model predicted "Positive Class", it was correct 37 times; a.k.a precision
Negative Predictive Value 0.921 0.770 0.000 When the model claims an instance is negative, it is correct 92.1% of the time; i.e. out of the 114 times the model predicted "Negative Class", it was correct 105 times
F1 Score 0.561 0.000 0.374 The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0.
Precision/Recall AUC 0.512 0.230 0.230 Precision/Recall AUC is calculated with `average_precision` which summarizes a precision-recall curve as the weighted mean of precisions achieved at each threshold. See sci-kit learn documentation for caveats.
Accuracy 0.710 0.770 0.230 71.0% of instances were correctly identified
Error Rate 0.290 0.230 0.770 29.0% of instances were incorrectly identified
% Positive 0.230 0.230 0.230 23.0% of the data are positive; i.e. out of 200 total observations; 46 are labeled as "Positive Class"
Total Observations 200 200 200 There are 200 total observations; i.e. sample size
In [34]:
# ROC curve for the test-set predictions.
roc_fig = evaluator.plot_roc_auc_curve()
roc_fig.show()
<Figure size 720x444.984 with 0 Axes>
In [35]:
# Precision/recall curve for the test-set predictions.
pr_fig = evaluator.plot_precision_recall_auc_curve()
pr_fig.show()
In [36]:
# How the main metrics vary as the score threshold moves.
threshold_fig = evaluator.plot_threshold_curves(score_threshold_range=(0.1, 0.7))
threshold_fig.show()
In [37]:
# Precision vs recall trade-off across candidate thresholds.
tradeoff_fig = evaluator.plot_precision_recall_tradeoff(score_threshold_range=(0.1, 0.6))
tradeoff_fig.show()
In [38]:
# Cumulative gain and lift by score percentile.
evaluator.calculate_lift_gain(return_style=True)
Out[38]:
  Gain Lift
Percentile    
5 0.11 2.17
10 0.24 2.39
15 0.33 2.17
20 0.43 2.17
25 0.48 1.91
30 0.61 2.03
35 0.72 2.05
40 0.78 1.96
45 0.83 1.84
50 0.85 1.70
55 0.89 1.62
60 0.89 1.49
65 0.91 1.40
70 0.93 1.34
75 0.93 1.25
80 0.93 1.17
85 0.98 1.15
90 1.00 1.11
95 1.00 1.05
100 1.00 1.00